In [1]:
from utils import Activities, Users

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ptitprince as pt

from utils import join_by_fuzzy

import info_utilities

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from pprint import pprint

import pydot
from IPython.display import Image

import warnings
#warnings.filterwarnings("ignore")
In [2]:
# Lift pandas' display truncation so frames are shown with every column and row.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)
In [3]:
# Folder (relative to the notebook) that holds the input files.
data_folder = "../data/"

# Shared plotly trace styling: smoothed spline lines with large, slightly
# transparent markers.
line_traces = {
    'mode': 'lines+markers',
    'line_shape': 'spline',
    'line_smoothing': 1,
    'marker_size': 10,
    'marker_opacity': 0.9,
}

Data loading and cleaning

In [4]:
# Load the tab-separated students table from the data folder.
students = pd.read_csv(data_folder + "students.csv", sep="\t")
In [5]:
# Students whose estimated grade (E_grade) is missing.
missing_grade_mask = students['E_grade'].isnull()
students_wo_grades = students[missing_grade_mask]
with_null = len(students_wo_grades)

# Report the share of affected students before they are removed
# (len(students) still counts everyone at this point).
print(f'There are {with_null}/{len(students)} students without estimate grade ({round(with_null/len(students)*100,2)}%)')

# Remove those rows from the working frame.
students.drop(index=students_wo_grades.index, inplace=True)
There are 133/543 students without estimate grade (24.49%)
In [6]:
# Lower and upper quartile of the estimated grade; used as class thresholds.
q_25, q_75 = (students['E_grade'].quantile(level) for level in (0.25, 0.75))
In [7]:
# Label every student by where the estimated grade falls relative to the
# quartiles:
#   E_grade <  q_25          -> 'Failing'
#   E_grade >= q_75          -> 'Good'
#   otherwise                -> 'Average'
# The label is built in a single assignment. The previous
# `students['E_result'].where(..., inplace=True)` mutated a temporary column
# selection, which pandas no longer propagates back to `students` under
# Copy-on-Write, so nobody would ever be labelled 'Good'.
# Rows with a missing E_grade were dropped earlier, so NaN needs no handling.
grade_conditions = [
    students['E_grade'] < q_25,
    students['E_grade'] >= q_75,
]
students['E_result'] = np.select(grade_conditions, ['Failing', 'Good'], default='Average')

# Plot colour per result class.
grade_map = {"Good": "green", "Failing": "red", "Average": "orange"}
# Ordinal rank per result class (1 = worst, 3 = best), used only for sorting.
order_map = {"Good": 3, "Failing": 1, "Average": 2}
students['grade_order'] = students["E_result"].map(order_map).astype("float64")
# Order rows Failing -> Average -> Good.
students.sort_values("grade_order", inplace=True)
In [8]:
# Summary statistics of the estimated grade for the students that remain.
students.loc[:, 'E_grade'].describe()
Out[8]:
count    410.000000
mean       4.673384
std        0.711358
min        0.000000
25%        4.372321
50%        4.766667
75%        5.128571
max        6.000000
Name: E_grade, dtype: float64
In [9]:
# Per-school-year activity counts and per-semester in-curriculum counts.
school_year_cols = [
    'n_activities_school_year_1',
    'n_activities_school_year_2',
    'n_activities_school_year_3',
]
semester_cols = [
    'n_in_curriculum_semester1',
    'n_in_curriculum_semester2',
    'n_in_curriculum_semester3',
    'n_in_curriculum_semester4',
    'n_in_curriculum_semester5',
]

# Row-wise means collapse each group of columns into a single feature.
students['E_activities_per_school_year'] = students[school_year_cols].mean(axis="columns")
students['E_in_curriculum_per_semester'] = students[semester_cols].mean(axis="columns")
In [10]:
# Columns that are not candidate features.
exclude_cols = ['us_user','user_name','user_email','start_year','start_semester','archived','user_type','contract_type','HGF','convocatore','teacher','supervisor','ispettore','statista','student','classes','companies','avg_specific_evaluations','avg_supervisor_evaluation','n_received_feedback_requests','n_feedback_responses']
# Grade / outcome columns.
exclude_cols += ['grade_1st','grade_2nd','grade_3rd','E_grade','E_result','final_CP','final_LP','final_IP','final_grade']
# grade_order is only a numeric encoding of E_result (which is excluded above),
# so keeping it would leak the outcome into the feature set.
exclude_cols += ['grade_order']
# Per-year and per-semester counts: summarised by the E_* mean columns instead.
exclude_cols += ['n_activities_school_year_1','n_activities_school_year_2','n_activities_school_year_3']
exclude_cols += ['n_in_curriculum_semester1','n_in_curriculum_semester2','n_in_curriculum_semester3','n_in_curriculum_semester4','n_in_curriculum_semester5']

# Keep the remaining columns in their original frame order. A set difference
# would reorder the features on every kernel restart (string hashing is
# randomised), which makes the plots below non-reproducible.
excluded_lookup = set(exclude_cols)
features = [col for col in students.columns if col not in excluded_lookup]
In [11]:
# Box plot of every candidate feature; the pandas plotting backend was set to
# plotly in the import cell, so the result is shown with fig.show().
feature_frame = students.loc[:, features]
fig = feature_frame.boxplot(height=800)
fig.show()

Check for correlations:

In [138]:
# One figure row per feature:
#   left  - raincloud plot of the feature split by result class
#   right - scatter of the feature against the estimated grade
rain_columns = features

n_row = len(rain_columns)
# squeeze=False keeps `axes` two-dimensional even when there is only one
# feature; otherwise axes[i, 0] below raises on a 1-D array.
# NOTE(review): at dpi=300 and 10 inches per row the figure becomes very tall
# for long feature lists; confirm it stays within the renderer's size limit.
f, axes = plt.subplots(n_row, 2, figsize=(18, 10*n_row), dpi=300, squeeze=False)

for i, col_name in enumerate(rain_columns):
    pt.RainCloud(x='E_result', y=col_name,
                 data=students, orient='h',
                 move=.0, alpha=.65, ax=axes[i, 0], palette="Set1")
    sns.scatterplot(x="E_grade", y=col_name, hue="E_result",
                    data=students, ax=axes[i, 1], palette="Set1")
In [ ]:
# Univariate linear fit of the estimated grade on a single feature.
corr_var = 'n_feedback_requests'
# scikit-learn needs a 2-D feature matrix; reshape the column once and reuse
# it for both fitting and prediction.
x_values = students[corr_var].values.reshape(-1, 1)
reg = LinearRegression().fit(x_values, students['E_grade'])
print(reg.coef_)

# (A px.scatter figure used to be built here and immediately discarded; the
# figure below already contains the raw points, so it was removed.)
fig = go.Figure()

# Raw observations, coloured by result class.
fig.add_trace(go.Scatter(x=students[corr_var], y=students['E_grade'],
                         mode='markers',
                         name='raw',
                         marker_color=students['E_result'].map(grade_map)))
# Fitted regression line.
fig.add_trace(go.Scatter(x=students[corr_var], y=reg.predict(x_values),
                         mode='lines',
                         name='predictions'))
fig.update_layout(xaxis_title=corr_var, yaxis_title='E_grade')
fig.show()
In [ ]:
# Univariate linear fit of the estimated grade on a single feature.
corr_var = 'n_in_curriculum'
# scikit-learn needs a 2-D feature matrix; reshape the column once and reuse
# it for both fitting and prediction.
x_values = students[corr_var].values.reshape(-1, 1)
reg = LinearRegression().fit(x_values, students['E_grade'])
print(reg.coef_)

# (A px.scatter figure used to be built here and immediately discarded; the
# figure below already contains the raw points, so it was removed.)
fig = go.Figure()

# Raw observations, coloured by result class to match the
# n_feedback_requests plot.
fig.add_trace(go.Scatter(x=students[corr_var], y=students['E_grade'],
                         mode='markers',
                         name='raw',
                         marker_color=students['E_result'].map(grade_map)))
# Fitted regression line.
fig.add_trace(go.Scatter(x=students[corr_var], y=reg.predict(x_values),
                         mode='lines',
                         name='predictions'))
fig.update_layout(xaxis_title=corr_var, yaxis_title='E_grade')
fig.show()
In [ ]:
# Univariate linear fit of the estimated grade on a single feature.
corr_var = 'n_activities'
# scikit-learn needs a 2-D feature matrix; reshape the column once and reuse
# it for both fitting and prediction.
x_values = students[corr_var].values.reshape(-1, 1)
reg = LinearRegression().fit(x_values, students['E_grade'])
print(reg.coef_)

# (A px.scatter figure used to be built here and immediately discarded; the
# figure below already contains the raw points, so it was removed.)
fig = go.Figure()

# Raw observations, coloured by result class to match the
# n_feedback_requests plot.
fig.add_trace(go.Scatter(x=students[corr_var], y=students['E_grade'],
                         mode='markers',
                         name='raw',
                         marker_color=students['E_result'].map(grade_map)))
# Fitted regression line.
fig.add_trace(go.Scatter(x=students[corr_var], y=reg.predict(x_values),
                         mode='lines',
                         name='predictions'))
fig.update_layout(xaxis_title=corr_var, yaxis_title='E_grade')
fig.show()
In [ ]:
# Univariate linear fit of the estimated grade on a single feature.
corr_var = 'n_folders'
# scikit-learn needs a 2-D feature matrix; reshape the column once and reuse
# it for both fitting and prediction.
x_values = students[corr_var].values.reshape(-1, 1)
reg = LinearRegression().fit(x_values, students['E_grade'])
print(reg.coef_)

# (A px.scatter figure used to be built here and immediately discarded; the
# figure below already contains the raw points, so it was removed.)
fig = go.Figure()

# Raw observations, coloured by result class to match the
# n_feedback_requests plot.
fig.add_trace(go.Scatter(x=students[corr_var], y=students['E_grade'],
                         mode='markers',
                         name='raw',
                         marker_color=students['E_result'].map(grade_map)))
# Fitted regression line.
fig.add_trace(go.Scatter(x=students[corr_var], y=reg.predict(x_values),
                         mode='lines',
                         name='predictions'))
fig.update_layout(xaxis_title=corr_var, yaxis_title='E_grade')
fig.show()